import covid19czechia as CZ
import covid19sweden as SE
import numpy as np
import plotly.express as px
import pandas as pd
from datetime import datetime, date, timedelta
# utility function
def week_num_to_date(year, week_num):
if year == 2020:
return(datetime.strptime(str(year) + str(week_num) + '-1', "%Y%W-%w") - timedelta(days = 7))
elif year == 2021:
return(datetime.strptime(str(year) + str(week_num) + '-1', "%Y%W-%w"))
# aggregate PL data
#file_name = './data/pl_google_sheet/' + datetime.today().strftime('%Y%m%d') + '.xlsx'
file_name = './data/pl_google_sheet/' + '20210317' + '.xlsx'
pl_cases = pd.read_excel(io = file_name,
sheet_name='Wzrost w województwach',
dtype=object,
engine='openpyxl',
skiprows=30,
nrows=16)
pl_value_vars = list(pl_cases.columns)[1:-2]
pl_cases = pd.melt(frame = pl_cases.iloc[:, 0:-2],
id_vars='Województwo',
value_vars=pl_value_vars,
var_name='date',
value_name='cases')
pl_cases['year'] = pl_cases['date'].apply(lambda x: int(x.year))
pl_cases['week'] = pl_cases['date'].apply(lambda x: int(x.isocalendar()[1]))
pl_cases.loc[(pl_cases['week'] == 53) & (pl_cases['date'].apply(lambda x:x.month) == 1), 'year'] -= 1
pl_cases = pl_cases.groupby(['year', 'week', 'Województwo']).aggregate({'cases': 'sum'}).reset_index()
pl_cases['date'] = pl_cases.apply(lambda r: week_num_to_date(r.year, r.week), axis = 1)
pl_deaths = pd.read_excel(io = file_name,
sheet_name='Wzrost w województwach',
dtype=object,
engine='openpyxl',
skiprows=70,
nrows=16)
pl_deaths = pd.melt(frame = pl_deaths.iloc[:, 0:-2],
id_vars='Województwo',
value_vars=pl_value_vars,
var_name='date',
value_name='deaths')
pl_deaths['year'] = pl_deaths['date'].apply(lambda x: int(x.year))
pl_deaths['week'] = pl_deaths['date'].apply(lambda x: int(x.isocalendar()[1]))
pl_deaths.loc[(pl_deaths['week'] == 53) & (pl_deaths['date'].apply(lambda x:x.month) == 1), 'year'] -= 1
pl_deaths = pl_deaths.groupby(['year', 'week', 'Województwo']).aggregate({'deaths': 'sum'}).reset_index()
pl_deaths['date'] = pl_deaths.apply(lambda r: week_num_to_date(r.year, r.week), axis = 1)
pl_tests_new = pd.read_excel(io = file_name,
sheet_name='Testy w województwach',
dtype=object,
engine='openpyxl',
skiprows=3,
nrows=16)
pl_tests_new = pd.melt(frame = pl_tests_new.iloc[:, 1:-2],
id_vars='Województwo',
value_vars=list(pl_tests_new.columns)[2:-2],
var_name='date',
value_name='tests')
pl_tests_new = pl_tests_new[pl_tests_new['date'] >= '2020-12-28'] # only pick up data from 28 Dec 2020 or later
pl_tests_new['year'] = pl_tests_new['date'].apply(lambda x: int(x.year))
pl_tests_new['week'] = pl_tests_new['date'].apply(lambda x: int(x.isocalendar()[1]))
pl_tests_new.loc[(pl_tests_new['week'] == 53) & (pl_tests_new['date'].apply(lambda x:x.month) == 1), 'year'] -= 1
pl_tests_new = pl_tests_new.groupby(['year', 'week', 'Województwo']).aggregate({'tests': 'sum'}).reset_index()
pl_tests_new['date'] = pl_tests_new.apply(lambda r: week_num_to_date(r.year, r.week), axis = 1)
pl_wkly_tests_old = pd.read_excel(io = file_name,
sheet_name=' Testy w województwach od 11.05', # sheet name appears to have changed to 'Testy w województwach od 11.05 do 28.12.2020'
dtype=object,
engine='openpyxl',
skiprows=2,
nrows=16,
verbose=False,
parse_dates=False,
date_parser=None)
pl_wkly_tests_old.iloc[12, 0] = 'Świętokrzyskie'
pl_wkly_tests_old = pd.melt(frame = pl_wkly_tests_old.iloc[:, 0:-5],
id_vars='Województwo',
value_vars=list(pl_wkly_tests_old.columns)[1:-5],
var_name='date',
value_name='tests')
pl_wkly_tests_old['year'] = pl_wkly_tests_old['date'].apply(lambda x: int(x.year))
pl_wkly_tests_old['week'] = pl_wkly_tests_old['date'].apply(lambda x: int(x.isocalendar()[1]))
pl_wkly_tests_old = pl_wkly_tests_old[pl_wkly_tests_old['date'] < '2020-12-28'] # only pick up data from before 28 Dec 2020
pl_data = pd.merge(pd.merge(pl_cases, pl_deaths, on = ['Województwo', 'date', 'year', 'week'], how = 'left'),
pd.concat([pl_tests_new, pl_wkly_tests_old]), on = ['Województwo', 'date', 'year', 'week'], how = 'left') \
.rename({'Województwo':'name'}, axis = 1)
# aggregate SE data
se_data = SE.covid_deaths()
se_data = se_data.groupby(['year', 'week', 'region']) \
.aggregate({'deaths':'sum', 'confirmed':'sum'}).reset_index()
se_data['date'] = se_data.apply(lambda r: week_num_to_date(r.year, r.week), axis = 1)
se_data = se_data.rename({'confirmed':'cases'}, axis = 1)
# aggregate CZ data
cz_data = CZ.covid_deaths(level = 2)
cz_data['year'] = cz_data['date'].apply(lambda x: x.year)
cz_data.loc[(cz_data['week'] == 53) & (cz_data['date'].apply(lambda x:x.month) == 1), 'year'] -= 1
cz_data = cz_data.groupby(['year', 'week', 'region']).aggregate({'deaths': 'sum'}).reset_index()
cz_data['date'] = cz_data.apply(lambda r: week_num_to_date(r.year, r.week), axis = 1)
# set up the regions dataset
regions = pd.read_csv("./data/regions.csv")
regions.loc[regions['NUTS3'].isin(['SE214', 'SE322', 'SE221', 'SE212', 'SE213', 'SE321', \
'SE332', 'SE331', 'SE312', 'SE311', 'SE313', 'SE124', \
'SE122', 'SE125', 'CZ041', 'SE231', 'SE211', 'SE121']), 'cluster_1'] = 1
regions.loc[regions['NUTS3'].isin(['SE123', 'CZ051', 'CZ063', 'CZ053', 'CZ052', 'CZ032', \
'CZ031', 'CZ072', 'CZ071']), 'cluster_1'] = 2
regions.loc[regions['NUTS3'].isin(['CZ042', 'PL52', 'PL43', 'PL84', 'CZ064', 'PL72', \
'CZ080', 'CZ020', 'SE224', 'PL62', 'PL42', 'SE232']), 'cluster_1'] = 3
regions.loc[regions['NUTS3'].isin(['PL61', 'PL82', 'PL81', 'PL63', 'PL71', 'SE110']), 'cluster_1'] = 4
regions.loc[regions['NUTS3'].isin(['PL51', 'PL21', 'PL41', 'PL22', 'PL9']), 'cluster_1'] = 5
regions.loc[regions['NUTS3'] == 'CZ010', 'cluster_1'] = 6
regions['cluster_2'] = regions['cluster_1'] - 1
regions.loc[regions['cluster_2'] == 0, 'cluster_2'] = 1
regions = regions.rename(columns = {'NUTS3':'region'})
# cross reference the PL/CZ/SE datasets with info in the regions file
pl_data = pd.merge(regions, pl_data, on='name', how="inner")
cz_data = pd.merge(regions, cz_data, on='region', how="inner")
se_data = pd.merge(regions, se_data, on='region', how="inner")
all_data = pd.concat([pl_data, cz_data, se_data])
# fix datatype of 'tests' column (handle NaN)
all_data['tests'] = all_data['tests'].astype('float')
# compute deaths and cases per 100K capita, deaths per test, cases per test
all_data['cases_100K'] = all_data['cases']/all_data['population']*100000
all_data['deaths_100K'] = all_data['deaths']/all_data['population']*100000
all_data['cases_per_test'] = all_data['cases']/all_data['tests']
all_data['deaths_per_test'] = all_data['deaths']/all_data['tests']
all_data['deaths_per_case'] = all_data['deaths']/all_data['cases']
# concatenate NUTS code and region name
all_data['region'] = all_data['region'] + ' - ' + all_data['name']
# exclude immature data
all_data = all_data[all_data['date'] <= '2021-03-01'] # w/o Mar 7, 2021
all_data = all_data[all_data['date'] >= '2020-03-16']
# visualise trends in all regions
fig = px.line(all_data[pd.notnull(all_data["cases"])], x="date", y="cases", color="region",
title="Weekly Cases by Region")
fig.show()
fig = px.line(all_data[pd.notnull(all_data["deaths"])], x="date", y="deaths", color="region",
title="Weekly Deaths by Region")
fig.show()
fig = px.line(all_data[pd.notnull(all_data["cases_100K"])], x="date", y="cases_100K", color="region",
title="Weekly Cases per 100K Capita by Region")
fig.show()
fig = px.line(all_data[pd.notnull(all_data["deaths_100K"])], x="date", y="deaths_100K", color="region",
title="Weekly Deaths per 100K Capita by Region")
fig.show()
fig = px.line(all_data[pd.notnull(all_data["tests"])], x="date", y="tests", color="region",
title="Weekly Tests by Region")
fig.show()
fig = px.line(all_data[pd.notnull(all_data["cases_per_test"])], x="date", y="cases_per_test", color="region",
title="Weekly Cases per Test by Region")
fig.show()
fig = px.line(all_data[pd.notnull(all_data["deaths_per_test"])], x="date", y="deaths_per_test", color="region",
title="Weekly Deaths per Test by Region")
fig.show()
fig = px.line(all_data[pd.notnull(all_data["deaths_per_case"])], x="date", y="deaths_per_case", color="region",
title="Weekly Deaths per Case by Region")
fig.show()